## Load the full scraped review data set. Per the parse spec echoed below, it
## has row-id columns X1/X1_1 plus Heading, Text, Rating, Time and Type.
alldata <- read_csv('alldata.csv')
## Warning: Missing column names filled in: 'X1' [1]
## Warning: Duplicated column names deduplicated: 'X1' => 'X1_1' [2]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X1_1 = col_double(),
## Heading = col_character(),
## Text = col_character(),
## Rating = col_double(),
## Time = col_datetime(format = ""),
## Type = col_character()
## )
## Split into one data frame per phone model; the three sections below run the
## same topic-modeling pipeline on each subset in turn.
iphone8 <- filter(alldata, Type == 'iphone8')
iphoneX <- filter(alldata, Type == 'iphoneX')
iphone11pm <- filter(alldata, Type == 'iphone11promax')
## iPhone 8 ----
## get text into tidy format, replace a few special words and remove stop words
## One row per (review X1, token); unnest_tokens lowercases tokens by default.
reviewsTidy <- iphone8 %>%
select(X1,Text) %>%
unnest_tokens(word, Text) %>%
anti_join(stop_words)
## Joining, by = "word"
## get raw word frequencies
wordCount <- reviewsTidy %>%
count(word,sort = TRUE)
## remove common words and lemmatize remaining
## NOTE(review): 'iPhone' can never match below — all tokens were lowercased
## by unnest_tokens, so only the lowercase entries have any effect.
commonWords <- c('iphone','phone','apple','iPhone')
reviewsTidy <- reviewsTidy %>%
mutate(lemma = lemmatize_words(word))
## Recompute frequencies on lemmas (overwrites the raw word counts above).
wordCount <- reviewsTidy %>%
count(lemma,sort = TRUE)
## remove infrequent words
freqLimit <- 20
vocab <- wordCount %>%
filter(n >= freqLimit)
## Keep frequent lemmas, then drop the domain-generic common words.
reviewsTidy <- reviewsTidy %>%
filter(lemma %in% vocab$lemma) %>%
filter(!lemma %in% commonWords)
## remove very short reviews
## n = number of retained lemma tokens per review (length AFTER filtering).
reviewLength <- reviewsTidy %>%
count(X1)
minLength <- 5
reviewLength <- reviewLength %>%
filter(n >= minLength)
## create document term matrix for use in LDA
## Documents are review ids (X1), terms are lemmas, values are token counts.
dtmUni <- reviewsTidy %>%
filter(X1 %in% reviewLength$X1) %>%
count(X1,lemma) %>%
cast_dtm(X1, lemma, n)
## Fit Gibbs-sampled LDA models over a grid of topic counts and cache each fit
## to disk (re-read below instead of re-fitting). alpha = 1/k encourages sparse
## per-document topic mixtures; seed fixed for reproducibility.
## Iterate over the values directly rather than the 1:length() anti-pattern.
numTopics <- c(10, 20, 30, 40)
for (k in numTopics) {
  theLDA <- LDA(dtmUni, k = k, method = "Gibbs",
                control = list(alpha = 1/k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhone8', k, '.rds'))
}
## Analyze the cached 10-topic unigram model: top words per topic.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhone8',theNumTopics,'.rds'))
## Per-topic word probabilities (beta): one row per (topic, term).
theTopicsBeta <- tidy(theLDA, matrix = "beta")
## Top 15 terms per topic; `x` is a global descending index so every facet can
## place its own terms on a shared continuous axis. (A redundant duplicate
## ungroup() in the original chain has been removed.)
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1) # for plotting
plTopicWeights8 <- TopicsTop %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x = x, y = beta, fill = topic)) +
  geom_col(show.legend = FALSE) + # identical to geom_bar(stat = 'identity')
  coord_flip() +
  facet_wrap(~topic, scales = 'free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0, 0)) +
  labs(title='Top Words by Topic',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone 8 Reviews'),
       caption = 'Note: The words "iPhone", "Apple" and "Phone" and reviews less than 5 words long have been removed.',
       x = 'word',
       y = 'beta') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
        axis.text.y = element_text(size = 6))
plTopicWeights8





## Hard topic assignment (.topic) for every (document, term) count in the DTM.
assignments <- augment(theLDA, data = dtmUni)
## Visualize one review as running text, colored by each word's topic.
theDocID <- '23'
theDoc <- assignments %>%
filter(document == theDocID)
## Map each retained lemma of this review back to its assigned topic.
tmp <- reviewsTidy %>%
filter(X1 == theDocID) %>%
left_join(select(theDoc,term,.topic), by = c('lemma'='term')) %>%
distinct()
## Re-tokenize the original text so removed words (stop words etc.) still
## appear in the layout; words with no assignment get .topic = NA.
theOrg <- iphone8 %>%
filter(X1==theDocID) %>%
select(X1,Text) %>%
unnest_tokens(word,Text) %>%
left_join(select(tmp,word,.topic), by = 'word') %>%
mutate(wordID = row_number())
## Grid positions: 10 words per row, 100 rows (1000 word slots). Words beyond
## position 1000 would get NA coordinates — assumes reviews are shorter.
theBreaks <- c(1:10)
theY <- c(100:1)
dfIndex <- data.frame( y = rep(theY,each = length(theBreaks)),
x = rep(theBreaks, length(theY)) ) %>%
mutate(wordID = row_number())
theOrg %>%
left_join(dfIndex, by = 'wordID') %>%
ggplot(aes(x=factor(x),y=y,label=word,color=factor(.topic))) +
geom_text() +
theme_bw() +
labs(x = '', y = '', title = paste0('ReviewID ',theDocID)) +
scale_color_discrete(name="Topic") +
theme(panel.grid.minor=element_blank(),
panel.grid.major=element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank())

## @knitr ReviewClustering
## Per-document topic mixtures (gamma); plot the mixtures of 5 random reviews.
theTopicsGamma <- tidy(theLDA, matrix = "gamma")
## NOTE(review): no set.seed() before sample_n, so the sampled reviews differ
## between runs.
theSampleReviews <- reviewLength %>%
sample_n(5)
theTopicsGamma %>%
filter(document %in% theSampleReviews$X1) %>%
ggplot(aes(x=topic,y=gamma,fill=document)) +
geom_bar(stat='identity') +
facet_wrap(~document,ncol = 1) +
theme(legend.position = 'none') +
scale_y_continuous(labels = percent) +
labs(title = '5 Random Reviews',
y = 'Topic Weight (Gamma)')

## @knitr TopicEvolution
## Average topic weight by review year. A character ID is needed because the
## gamma table's `document` column is character while X1 is numeric.
iphone8 <- iphone8 %>%
mutate(ID = as.character(X1))
theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
inner_join(iphone8,by=c('document'='ID'))
theTopicsGamma %>%
group_by(topic, Year = year(Time)) %>%
summarize(mean = mean(gamma)) %>%
ggplot(aes(x=Year,y=mean,group=topic)) + geom_line() +
facet_wrap(~topic,labeller = label_both) +
scale_y_continuous(labels = percent) +
labs(title = 'Topic Evolution', x = 'Year of Review', y = 'Average Topic Weight') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

## @knitr TopicSentiments
## Average topic weight by star rating: which topics dominate 1-star vs 5-star
## reviews. Relies on the ID column added in the TopicEvolution chunk above.
theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
inner_join(iphone8,by=c('document'='ID'))
theTopicsGamma %>%
group_by(Rating,topic) %>%
summarize(mean = mean(gamma)) %>%
mutate(topic = factor(topic)) %>%
ggplot(aes(x=Rating,y=mean,fill=topic)) +
geom_bar(stat='identity') +
facet_wrap(~topic, scales = 'free', labeller = label_both) +
scale_y_continuous(labels = percent) +
theme(legend.position = 'none') +
labs(title = 'Topic Weights by Star Rating', x = 'Rating', y = 'Average Topic Weight')

## @knitr LDAAriaUniandBigrams
## Record each token's position within its review; used below to drop the
## unigrams that get merged into bigram tokens.
reviewsTidyUni <- reviewsTidy %>%
group_by(X1) %>%
mutate(wordNumber = row_number()) %>%
ungroup()
plTopicWeights8

## all reviews
## Build candidate bigrams by pairing each lemma with the next lemma in the
## SAME review. BUG FIX: the original ran lead() ungrouped, silently pairing
## the last word of one review with the first word of the next; grouping by
## X1 makes lead() return NA at review boundaries instead (such "word_NA"
## strings cannot reach the frequency threshold below).
tmpUni <- reviewsTidyUni %>%
  rename(lemma1 = lemma) %>%
  group_by(X1) %>%
  mutate(lemma2 = lead(lemma1),
         Index1 = wordNumber,
         Index2 = lead(wordNumber),
         bilemma = paste0(lemma1, '_', lemma2)) %>%
  ungroup()
## Keep only bigrams frequent enough to be treated as single tokens.
BiLimit <- 100
freqBi <- tmpUni %>%
  count(bilemma, sort = TRUE) %>%  # TRUE, not the reassignable alias T
  filter(n >= BiLimit)
newBi <- tmpUni %>%
  filter(bilemma %in% freqBi$bilemma)
## (review, word position) pairs absorbed into a kept bigram: both the first
## and second word positions of each bigram leave the unigram stream.
## gather() is superseded — pivot_longer() gathers the same two columns.
tmpRemoveRows <- newBi %>%
  select(Index1, Index2, bilemma, X1) %>%
  pivot_longer(c(Index1, Index2), names_to = 'Index', values_to = 'wordNumber') %>%
  select(X1, wordNumber)
newBi <- newBi %>%
  select(X1, bilemma) %>%
  rename(lemma1 = bilemma)
## Replace the absorbed unigrams with their bigram tokens.
reviewsTidyUniBi <- tmpUni %>%
  anti_join(tmpRemoveRows, by = c('X1', 'wordNumber')) %>%
  select(X1, lemma1) %>%
  bind_rows(newBi)
## Restrict the mixed unigram/bigram vocabulary to frequent tokens.
vocab <- reviewsTidyUniBi %>%
  count(lemma1, sort = TRUE) %>%  # TRUE, not the reassignable alias T
  filter(n >= 20)
reviewsTidyUniBi <- reviewsTidyUniBi %>%
  filter(lemma1 %in% vocab$lemma1)
## remove very short reviews (recomputed on the unigram+bigram token stream)
reviewLength <- reviewsTidyUniBi %>%
  count(X1)
minLength <- 5
reviewLength <- reviewLength %>%
  filter(n >= minLength)
## create document term matrix for use in LDA
dtmBi <- reviewsTidyUniBi %>%
  filter(X1 %in% reviewLength$X1) %>%
  count(X1, lemma1) %>%
  cast_dtm(X1, lemma1, n)
## Fit and cache Gibbs LDA models over the topic-count grid; iterate over the
## values directly rather than the 1:length() anti-pattern.
numTopics <- c(10, 20, 30, 40)
for (k in numTopics) {
  theLDA <- LDA(dtmBi, k = k, method = "Gibbs",
                control = list(alpha = 1/k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhone8_Bi', k, '.rds'))
}
## @knitr AnalyzeTopicsUniBi
## Top words per topic for the cached 10-topic unigram+bigram model.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhone8_Bi',theNumTopics,'.rds'))
theTopicsBeta <- tidy(theLDA, matrix = "beta")
## Top 15 terms per topic; `x` is a global descending plotting index.
## (A redundant duplicate ungroup() in the original chain has been removed.)
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1) # for plotting
plTopicWeights8_2 <- TopicsTop %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x = x, y = beta, fill = topic)) +
  geom_col(show.legend = FALSE) + # identical to geom_bar(stat = 'identity')
  coord_flip() +
  facet_wrap(~topic, scales = 'free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0, 0)) +
  labs(title='Topic Model with both Unigrams and Bigrams',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone 8 Reviews'),
       x = 'word',
       y = 'beta') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
        axis.text.y = element_text(size = 6))
plTopicWeights8_2

## Build LDAvis inputs: phi (topics x terms) and theta (documents x topics).
## spread() is superseded; pivot_wider(names_sort = TRUE) + arrange() exactly
## reproduce spread()'s sorted column and row order.
theTopicsBetaW <- tidy(theLDA, matrix = "beta") %>%
  pivot_wider(names_from = term, values_from = beta, names_sort = TRUE) %>%
  arrange(topic) %>%
  select(-topic)
theTopicsGammaWide <- tidy(theLDA, matrix = "gamma") %>%
  pivot_wider(names_from = topic, values_from = gamma, names_sort = TRUE) %>%
  arrange(document)
## BUG FIX: doc.length must follow theta's row (document) order. The original
## passed reviewLength$n, sorted by numeric X1, while theta's rows sort by the
## character id ("10" < "2"), silently misaligning lengths. Align via match().
theDocLength <- reviewLength$n[match(theTopicsGammaWide$document,
                                     as.character(reviewLength$X1))]
theTopicsGammaW <- select(theTopicsGammaWide, -document)
theTerms <- colnames(theTopicsBetaW)
## Term frequencies aligned to phi's column order; drop vocab entries that
## only occurred in the short reviews removed before building the DTM.
theVocab <- vocab %>%
  filter(lemma1 %in% theTerms) %>%
  mutate(word = factor(lemma1, levels = theTerms)) %>%
  arrange(word) %>%
  mutate(word = as.character(word))
json <- createJSON(
  phi = theTopicsBetaW,
  theta = theTopicsGammaW,
  doc.length = theDocLength,
  vocab = theTerms,
  R = theNumTopics,
  term.frequency = theVocab$n
)
serVis(json)
## Loading required namespace: servr
## iPhone X ----
## get text into tidy format, replace a few special words and remove stop words
## One row per (review X1, token); unnest_tokens lowercases tokens by default.
reviewsTidy <- iphoneX %>%
select(X1,Text) %>%
unnest_tokens(word, Text) %>%
anti_join(stop_words)
## Joining, by = "word"
## get raw word frequencies
wordCount <- reviewsTidy %>%
count(word,sort = TRUE)
## remove common words and lemmatize remaining
## NOTE(review): 'iPhone' can never match below — all tokens were lowercased
## by unnest_tokens, so only the lowercase entries have any effect.
commonWords <- c('iphone','phone','apple','iPhone')
reviewsTidy <- reviewsTidy %>%
mutate(lemma = lemmatize_words(word))
## Recompute frequencies on lemmas (overwrites the raw word counts above).
wordCount <- reviewsTidy %>%
count(lemma,sort = TRUE)
## remove infrequent words
freqLimit <- 20
vocab <- wordCount %>%
filter(n >= freqLimit)
## Keep frequent lemmas, then drop the domain-generic common words.
reviewsTidy <- reviewsTidy %>%
filter(lemma %in% vocab$lemma) %>%
filter(!lemma %in% commonWords)
## remove very short reviews
## n = number of retained lemma tokens per review (length AFTER filtering).
reviewLength <- reviewsTidy %>%
count(X1)
minLength <- 5
reviewLength <- reviewLength %>%
filter(n >= minLength)
## create document term matrix for use in LDA
## Documents are review ids (X1), terms are lemmas, values are token counts.
dtmUni <- reviewsTidy %>%
filter(X1 %in% reviewLength$X1) %>%
count(X1,lemma) %>%
cast_dtm(X1, lemma, n)
## Fit Gibbs-sampled LDA models over a grid of topic counts and cache each fit
## to disk. alpha = 1/k encourages sparse per-document topic mixtures; seed
## fixed for reproducibility. Iterate over the values directly rather than the
## 1:length() anti-pattern.
numTopics <- c(10, 20, 30, 40)
for (k in numTopics) {
  theLDA <- LDA(dtmUni, k = k, method = "Gibbs",
                control = list(alpha = 1/k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhoneX', k, '.rds'))
}
## Analyze the cached 10-topic unigram model: top words per topic.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhoneX',theNumTopics,'.rds'))
## Per-topic word probabilities (beta): one row per (topic, term).
theTopicsBeta <- tidy(theLDA, matrix = "beta")
## Top 15 terms per topic; `x` is a global descending index so every facet can
## place its own terms on a shared continuous axis. (A redundant duplicate
## ungroup() in the original chain has been removed.)
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1) # for plotting
plTopicWeightsX <- TopicsTop %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x = x, y = beta, fill = topic)) +
  geom_col(show.legend = FALSE) + # identical to geom_bar(stat = 'identity')
  coord_flip() +
  facet_wrap(~topic, scales = 'free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0, 0)) +
  labs(title='Top Words by Topic',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone X Reviews'),
       caption = 'Note: The words "iPhone", "Apple" and "Phone" and reviews less than 5 words long have been removed.',
       x = 'word',
       y = 'beta') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
        axis.text.y = element_text(size = 6))
plTopicWeightsX





## Hard topic assignment (.topic) for every (document, term) count in the DTM.
assignments <- augment(theLDA, data = dtmUni)
## Visualize one review as running text, colored by each word's topic.
theDocID <- '23'
theDoc <- assignments %>%
filter(document == theDocID)
## Map each retained lemma of this review back to its assigned topic.
tmp <- reviewsTidy %>%
filter(X1 == theDocID) %>%
left_join(select(theDoc,term,.topic), by = c('lemma'='term')) %>%
distinct()
## Re-tokenize the original text so removed words (stop words etc.) still
## appear in the layout; words with no assignment get .topic = NA.
theOrg <- iphoneX %>%
filter(X1==theDocID) %>%
select(X1,Text) %>%
unnest_tokens(word,Text) %>%
left_join(select(tmp,word,.topic), by = 'word') %>%
mutate(wordID = row_number())
## Grid positions: 10 words per row, 100 rows (1000 word slots). Words beyond
## position 1000 would get NA coordinates — assumes reviews are shorter.
theBreaks <- c(1:10)
theY <- c(100:1)
dfIndex <- data.frame( y = rep(theY,each = length(theBreaks)),
x = rep(theBreaks, length(theY)) ) %>%
mutate(wordID = row_number())
theOrg %>%
left_join(dfIndex, by = 'wordID') %>%
ggplot(aes(x=factor(x),y=y,label=word,color=factor(.topic))) +
geom_text() +
theme_bw() +
labs(x = '', y = '', title = paste0('ReviewID ',theDocID)) +
scale_color_discrete(name="Topic") +
theme(panel.grid.minor=element_blank(),
panel.grid.major=element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank())

## @knitr ReviewClustering
## Per-document topic mixtures (gamma); plot the mixtures of 5 random reviews.
theTopicsGamma <- tidy(theLDA, matrix = "gamma")
## NOTE(review): no set.seed() before sample_n, so the sampled reviews differ
## between runs.
theSampleReviews <- reviewLength %>%
sample_n(5)
theTopicsGamma %>%
filter(document %in% theSampleReviews$X1) %>%
ggplot(aes(x=topic,y=gamma,fill=document)) +
geom_bar(stat='identity') +
facet_wrap(~document,ncol = 1) +
theme(legend.position = 'none') +
scale_y_continuous(labels = percent) +
labs(title = '5 Random Reviews',
y = 'Topic Weight (Gamma)')

## @knitr TopicEvolution
## Average topic weight by review year. A character ID is needed because the
## gamma table's `document` column is character while X1 is numeric.
iphoneX <- iphoneX %>%
mutate(ID = as.character(X1))
theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
inner_join(iphoneX,by=c('document'='ID'))
theTopicsGamma %>%
group_by(topic, Year = year(Time)) %>%
summarize(mean = mean(gamma)) %>%
ggplot(aes(x=Year,y=mean,group=topic)) + geom_line() +
facet_wrap(~topic,labeller = label_both) +
scale_y_continuous(labels = percent) +
labs(title = 'Topic Evolution', x = 'Year of Review', y = 'Average Topic Weight') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

## @knitr TopicSentiments
## Average topic weight by star rating: which topics dominate 1-star vs 5-star
## reviews. Relies on the ID column added in the TopicEvolution chunk above.
theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
inner_join(iphoneX,by=c('document'='ID'))
theTopicsGamma %>%
group_by(Rating,topic) %>%
summarize(mean = mean(gamma)) %>%
mutate(topic = factor(topic)) %>%
ggplot(aes(x=Rating,y=mean,fill=topic)) +
geom_bar(stat='identity') +
facet_wrap(~topic, scales = 'free', labeller = label_both) +
scale_y_continuous(labels = percent) +
theme(legend.position = 'none') +
labs(title = 'Topic Weights by Star Rating', x = 'Rating', y = 'Average Topic Weight')

## @knitr LDAAriaUniandBigrams
## Record each token's position within its review; used below to drop the
## unigrams that get merged into bigram tokens.
reviewsTidyUni <- reviewsTidy %>%
group_by(X1) %>%
mutate(wordNumber = row_number()) %>%
ungroup()
plTopicWeightsX

## all reviews
## Build candidate bigrams by pairing each lemma with the next lemma in the
## SAME review. BUG FIX: the original ran lead() ungrouped, silently pairing
## the last word of one review with the first word of the next; grouping by
## X1 makes lead() return NA at review boundaries instead (such "word_NA"
## strings cannot reach the frequency threshold below).
tmpUni <- reviewsTidyUni %>%
  rename(lemma1 = lemma) %>%
  group_by(X1) %>%
  mutate(lemma2 = lead(lemma1),
         Index1 = wordNumber,
         Index2 = lead(wordNumber),
         bilemma = paste0(lemma1, '_', lemma2)) %>%
  ungroup()
## Keep only bigrams frequent enough to be treated as single tokens.
BiLimit <- 100
freqBi <- tmpUni %>%
  count(bilemma, sort = TRUE) %>%  # TRUE, not the reassignable alias T
  filter(n >= BiLimit)
newBi <- tmpUni %>%
  filter(bilemma %in% freqBi$bilemma)
## (review, word position) pairs absorbed into a kept bigram: both the first
## and second word positions of each bigram leave the unigram stream.
## gather() is superseded — pivot_longer() gathers the same two columns.
tmpRemoveRows <- newBi %>%
  select(Index1, Index2, bilemma, X1) %>%
  pivot_longer(c(Index1, Index2), names_to = 'Index', values_to = 'wordNumber') %>%
  select(X1, wordNumber)
newBi <- newBi %>%
  select(X1, bilemma) %>%
  rename(lemma1 = bilemma)
## Replace the absorbed unigrams with their bigram tokens.
reviewsTidyUniBi <- tmpUni %>%
  anti_join(tmpRemoveRows, by = c('X1', 'wordNumber')) %>%
  select(X1, lemma1) %>%
  bind_rows(newBi)
## Restrict the mixed unigram/bigram vocabulary to frequent tokens.
vocab <- reviewsTidyUniBi %>%
  count(lemma1, sort = TRUE) %>%  # TRUE, not the reassignable alias T
  filter(n >= 20)
reviewsTidyUniBi <- reviewsTidyUniBi %>%
  filter(lemma1 %in% vocab$lemma1)
## remove very short reviews (recomputed on the unigram+bigram token stream)
reviewLength <- reviewsTidyUniBi %>%
  count(X1)
minLength <- 5
reviewLength <- reviewLength %>%
  filter(n >= minLength)
## create document term matrix for use in LDA
dtmBi <- reviewsTidyUniBi %>%
  filter(X1 %in% reviewLength$X1) %>%
  count(X1, lemma1) %>%
  cast_dtm(X1, lemma1, n)
## Fit and cache Gibbs LDA models over the topic-count grid; iterate over the
## values directly rather than the 1:length() anti-pattern.
numTopics <- c(10, 20, 30, 40)
for (k in numTopics) {
  theLDA <- LDA(dtmBi, k = k, method = "Gibbs",
                control = list(alpha = 1/k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhoneX_Bi', k, '.rds'))
}
## @knitr AnalyzeTopicsUniBi
## Top words per topic for the cached 10-topic unigram+bigram model.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhoneX_Bi',theNumTopics,'.rds'))
theTopicsBeta <- tidy(theLDA, matrix = "beta")
## Top 15 terms per topic; `x` is a global descending plotting index.
## (A redundant duplicate ungroup() in the original chain has been removed.)
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1) # for plotting
plTopicWeightsX_2 <- TopicsTop %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x = x, y = beta, fill = topic)) +
  geom_col(show.legend = FALSE) + # identical to geom_bar(stat = 'identity')
  coord_flip() +
  facet_wrap(~topic, scales = 'free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0, 0)) +
  labs(title='Topic Model with both Unigrams and Bigrams',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone X Reviews'),
       x = 'word',
       y = 'beta') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
        axis.text.y = element_text(size = 6))
plTopicWeightsX_2

## Build LDAvis inputs: phi (topics x terms) and theta (documents x topics).
## spread() is superseded; pivot_wider(names_sort = TRUE) + arrange() exactly
## reproduce spread()'s sorted column and row order.
theTopicsBetaW <- tidy(theLDA, matrix = "beta") %>%
  pivot_wider(names_from = term, values_from = beta, names_sort = TRUE) %>%
  arrange(topic) %>%
  select(-topic)
theTopicsGammaWide <- tidy(theLDA, matrix = "gamma") %>%
  pivot_wider(names_from = topic, values_from = gamma, names_sort = TRUE) %>%
  arrange(document)
## BUG FIX: doc.length must follow theta's row (document) order. The original
## passed reviewLength$n, sorted by numeric X1, while theta's rows sort by the
## character id ("10" < "2"), silently misaligning lengths. Align via match().
theDocLength <- reviewLength$n[match(theTopicsGammaWide$document,
                                     as.character(reviewLength$X1))]
theTopicsGammaW <- select(theTopicsGammaWide, -document)
theTerms <- colnames(theTopicsBetaW)
## Term frequencies aligned to phi's column order; drop vocab entries that
## only occurred in the short reviews removed before building the DTM.
theVocab <- vocab %>%
  filter(lemma1 %in% theTerms) %>%
  mutate(word = factor(lemma1, levels = theTerms)) %>%
  arrange(word) %>%
  mutate(word = as.character(word))
json <- createJSON(
  phi = theTopicsBetaW,
  theta = theTopicsGammaW,
  doc.length = theDocLength,
  vocab = theTerms,
  R = theNumTopics,
  term.frequency = theVocab$n
)
serVis(json)
## iPhone 11 Pro Max ----
## get text into tidy format, replace a few special words and remove stop words
## One row per (review X1, token); unnest_tokens lowercases tokens by default.
reviewsTidy <- iphone11pm %>%
select(X1,Text) %>%
unnest_tokens(word, Text) %>%
anti_join(stop_words)
## Joining, by = "word"
## get raw word frequencies
wordCount <- reviewsTidy %>%
count(word,sort = TRUE)
## remove common words and lemmatize remaining
## NOTE(review): 'iPhone' can never match below — all tokens were lowercased
## by unnest_tokens, so only the lowercase entries have any effect.
commonWords <- c('iphone','phone','apple','iPhone')
reviewsTidy <- reviewsTidy %>%
mutate(lemma = lemmatize_words(word))
## Recompute frequencies on lemmas (overwrites the raw word counts above).
wordCount <- reviewsTidy %>%
count(lemma,sort = TRUE)
## remove infrequent words
freqLimit <- 20
vocab <- wordCount %>%
filter(n >= freqLimit)
## Keep frequent lemmas, then drop the domain-generic common words.
reviewsTidy <- reviewsTidy %>%
filter(lemma %in% vocab$lemma) %>%
filter(!lemma %in% commonWords)
## remove very short reviews
## n = number of retained lemma tokens per review (length AFTER filtering).
reviewLength <- reviewsTidy %>%
count(X1)
minLength <- 5
reviewLength <- reviewLength %>%
filter(n >= minLength)
## create document term matrix for use in LDA
## Documents are review ids (X1), terms are lemmas, values are token counts.
dtmUni <- reviewsTidy %>%
filter(X1 %in% reviewLength$X1) %>%
count(X1,lemma) %>%
cast_dtm(X1, lemma, n)
## Fit Gibbs-sampled LDA models over a grid of topic counts and cache each fit
## to disk. alpha = 1/k encourages sparse per-document topic mixtures; seed
## fixed for reproducibility. Iterate over the values directly rather than the
## 1:length() anti-pattern.
numTopics <- c(10, 20, 30, 40)
for (k in numTopics) {
  theLDA <- LDA(dtmUni, k = k, method = "Gibbs",
                control = list(alpha = 1/k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhone11pm', k, '.rds'))
}
## Analyze the cached 10-topic unigram model: top words per topic.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhone11pm',theNumTopics,'.rds'))
## Per-topic word probabilities (beta): one row per (topic, term).
theTopicsBeta <- tidy(theLDA, matrix = "beta")
## Top 15 terms per topic; `x` is a global descending index so every facet can
## place its own terms on a shared continuous axis. (A redundant duplicate
## ungroup() in the original chain has been removed.)
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1) # for plotting
plTopicWeights11 <- TopicsTop %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x = x, y = beta, fill = topic)) +
  geom_col(show.legend = FALSE) + # identical to geom_bar(stat = 'identity')
  coord_flip() +
  facet_wrap(~topic, scales = 'free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0, 0)) +
  labs(title='Top Words by Topic',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone 11 Pro Max Reviews'),
       caption = 'Note: The words "iPhone", "Apple" and "Phone" and reviews less than 5 words long have been removed.',
       x = 'word',
       y = 'beta') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
        axis.text.y = element_text(size = 6))
plTopicWeights11





## Hard topic assignment (.topic) for every (document, term) count in the DTM.
assignments <- augment(theLDA, data = dtmUni)
## Visualize one review as running text, colored by each word's topic.
theDocID <- '2519'
theDoc <- assignments %>%
filter(document == theDocID)
## Map each retained lemma of this review back to its assigned topic.
tmp <- reviewsTidy %>%
filter(X1 == theDocID) %>%
left_join(select(theDoc,term,.topic), by = c('lemma'='term')) %>%
distinct()
## Re-tokenize the original text so removed words (stop words etc.) still
## appear in the layout; words with no assignment get .topic = NA.
theOrg <- iphone11pm %>%
filter(X1==theDocID) %>%
select(X1,Text) %>%
unnest_tokens(word,Text) %>%
left_join(select(tmp,word,.topic), by = 'word') %>%
mutate(wordID = row_number())
## Grid positions: 10 words per row, 100 rows (1000 word slots). Words beyond
## position 1000 would get NA coordinates — assumes reviews are shorter.
theBreaks <- c(1:10)
theY <- c(100:1)
dfIndex <- data.frame( y = rep(theY,each = length(theBreaks)),
x = rep(theBreaks, length(theY)) ) %>%
mutate(wordID = row_number())
theOrg %>%
left_join(dfIndex, by = 'wordID') %>%
ggplot(aes(x=factor(x),y=y,label=word,color=factor(.topic))) +
geom_text() +
theme_bw() +
labs(x = '', y = '', title = paste0('ReviewID ',theDocID)) +
scale_color_discrete(name="Topic") +
theme(panel.grid.minor=element_blank(),
panel.grid.major=element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank())

## @knitr ReviewClustering
## Per-document topic mixtures (gamma); plot the mixtures of 5 random reviews.
theTopicsGamma <- tidy(theLDA, matrix = "gamma")
## NOTE(review): no set.seed() before sample_n, so the sampled reviews differ
## between runs.
theSampleReviews <- reviewLength %>%
sample_n(5)
theTopicsGamma %>%
filter(document %in% theSampleReviews$X1) %>%
ggplot(aes(x=topic,y=gamma,fill=document)) +
geom_bar(stat='identity') +
facet_wrap(~document,ncol = 1) +
theme(legend.position = 'none') +
scale_y_continuous(labels = percent) +
labs(title = '5 Random Reviews',
y = 'Topic Weight (Gamma)')

## @knitr TopicEvolution
## Average topic weight by review year. A character ID is needed because the
## gamma table's `document` column is character while X1 is numeric.
iphone11pm <- iphone11pm %>%
mutate(ID = as.character(X1))
theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
inner_join(iphone11pm,by=c('document'='ID'))
theTopicsGamma %>%
group_by(topic, Year = year(Time)) %>%
summarize(mean = mean(gamma)) %>%
ggplot(aes(x=Year,y=mean,group=topic)) + geom_line() +
facet_wrap(~topic,labeller = label_both) +
scale_y_continuous(labels = percent) +
labs(title = 'Topic Evolution', x = 'Year of Review', y = 'Average Topic Weight') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

## @knitr TopicSentiments
## Average topic weight by star rating: which topics dominate 1-star vs 5-star
## reviews. Relies on the ID column added in the TopicEvolution chunk above.
theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
inner_join(iphone11pm,by=c('document'='ID'))
theTopicsGamma %>%
group_by(Rating,topic) %>%
summarize(mean = mean(gamma)) %>%
mutate(topic = factor(topic)) %>%
ggplot(aes(x=Rating,y=mean,fill=topic)) +
geom_bar(stat='identity') +
facet_wrap(~topic, scales = 'free', labeller = label_both) +
scale_y_continuous(labels = percent) +
theme(legend.position = 'none') +
labs(title = 'Topic Weights by Star Rating', x = 'Rating', y = 'Average Topic Weight')

## @knitr LDAAriaUniandBigrams
## Record each token's position within its review; used below to drop the
## unigrams that get merged into bigram tokens.
reviewsTidyUni <- reviewsTidy %>%
group_by(X1) %>%
mutate(wordNumber = row_number()) %>%
ungroup()
plTopicWeights11

## all reviews
## Build candidate bigrams by pairing each lemma with the next lemma in the
## SAME review. BUG FIX: the original ran lead() ungrouped, silently pairing
## the last word of one review with the first word of the next; grouping by
## X1 makes lead() return NA at review boundaries instead (such "word_NA"
## strings cannot reach the frequency threshold below).
tmpUni <- reviewsTidyUni %>%
  rename(lemma1 = lemma) %>%
  group_by(X1) %>%
  mutate(lemma2 = lead(lemma1),
         Index1 = wordNumber,
         Index2 = lead(wordNumber),
         bilemma = paste0(lemma1, '_', lemma2)) %>%
  ungroup()
## Keep only bigrams frequent enough to be treated as single tokens.
BiLimit <- 100
freqBi <- tmpUni %>%
  count(bilemma, sort = TRUE) %>%  # TRUE, not the reassignable alias T
  filter(n >= BiLimit)
newBi <- tmpUni %>%
  filter(bilemma %in% freqBi$bilemma)
## (review, word position) pairs absorbed into a kept bigram: both the first
## and second word positions of each bigram leave the unigram stream.
## gather() is superseded — pivot_longer() gathers the same two columns.
tmpRemoveRows <- newBi %>%
  select(Index1, Index2, bilemma, X1) %>%
  pivot_longer(c(Index1, Index2), names_to = 'Index', values_to = 'wordNumber') %>%
  select(X1, wordNumber)
newBi <- newBi %>%
  select(X1, bilemma) %>%
  rename(lemma1 = bilemma)
## Replace the absorbed unigrams with their bigram tokens.
reviewsTidyUniBi <- tmpUni %>%
  anti_join(tmpRemoveRows, by = c('X1', 'wordNumber')) %>%
  select(X1, lemma1) %>%
  bind_rows(newBi)
## Restrict the mixed unigram/bigram vocabulary to frequent tokens.
vocab <- reviewsTidyUniBi %>%
  count(lemma1, sort = TRUE) %>%  # TRUE, not the reassignable alias T
  filter(n >= 20)
reviewsTidyUniBi <- reviewsTidyUniBi %>%
  filter(lemma1 %in% vocab$lemma1)
## remove very short reviews (recomputed on the unigram+bigram token stream)
reviewLength <- reviewsTidyUniBi %>%
  count(X1)
minLength <- 5
reviewLength <- reviewLength %>%
  filter(n >= minLength)
## create document term matrix for use in LDA
dtmBi <- reviewsTidyUniBi %>%
  filter(X1 %in% reviewLength$X1) %>%
  count(X1, lemma1) %>%
  cast_dtm(X1, lemma1, n)
## Fit and cache Gibbs LDA models over the topic-count grid; iterate over the
## values directly rather than the 1:length() anti-pattern.
numTopics <- c(10, 20, 30, 40)
for (k in numTopics) {
  theLDA <- LDA(dtmBi, k = k, method = "Gibbs",
                control = list(alpha = 1/k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhone11pm_Bi', k, '.rds'))
}
## @knitr AnalyzeTopicsUniBi
## Top words per topic for the cached 10-topic unigram+bigram model.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhone11pm_Bi',theNumTopics,'.rds'))
theTopicsBeta <- tidy(theLDA, matrix = "beta")
## Top 15 terms per topic; `x` is a global descending plotting index.
## (A redundant duplicate ungroup() in the original chain has been removed.)
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1) # for plotting
plTopicWeights11_2 <- TopicsTop %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x = x, y = beta, fill = topic)) +
  geom_col(show.legend = FALSE) + # identical to geom_bar(stat = 'identity')
  coord_flip() +
  facet_wrap(~topic, scales = 'free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0, 0)) +
  labs(title='Topic Model with both Unigrams and Bigrams',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone 11 Pro Max Reviews'),
       x = 'word',
       y = 'beta') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
        axis.text.y = element_text(size = 6))
plTopicWeights11_2

## Build LDAvis inputs: phi (topics x terms) and theta (documents x topics).
## spread() is superseded; pivot_wider(names_sort = TRUE) + arrange() exactly
## reproduce spread()'s sorted column and row order.
theTopicsBetaW <- tidy(theLDA, matrix = "beta") %>%
  pivot_wider(names_from = term, values_from = beta, names_sort = TRUE) %>%
  arrange(topic) %>%
  select(-topic)
theTopicsGammaWide <- tidy(theLDA, matrix = "gamma") %>%
  pivot_wider(names_from = topic, values_from = gamma, names_sort = TRUE) %>%
  arrange(document)
## BUG FIX: doc.length must follow theta's row (document) order. The original
## passed reviewLength$n, sorted by numeric X1, while theta's rows sort by the
## character id ("10" < "2"), silently misaligning lengths. Align via match().
theDocLength <- reviewLength$n[match(theTopicsGammaWide$document,
                                     as.character(reviewLength$X1))]
theTopicsGammaW <- select(theTopicsGammaWide, -document)
theTerms <- colnames(theTopicsBetaW)
## Term frequencies aligned to phi's column order; drop vocab entries that
## only occurred in the short reviews removed before building the DTM.
theVocab <- vocab %>%
  filter(lemma1 %in% theTerms) %>%
  mutate(word = factor(lemma1, levels = theTerms)) %>%
  arrange(word) %>%
  mutate(word = as.character(word))
json <- createJSON(
  phi = theTopicsBetaW,
  theta = theTopicsGammaW,
  doc.length = theDocLength,
  vocab = theTerms,
  R = theNumTopics,
  term.frequency = theVocab$n
)
serVis(json)